{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "from IPython.core.interactiveshell import InteractiveShell\n",
    "InteractiveShell.ast_node_interactivity = 'all'  # default is 'last_expr'\n",
    "\n",
    "%load_ext autoreload\n",
    "%autoreload 2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "import sys\n",
    "sys.path.append('/home/mink/notebooks/CameraTraps')  # append this repo to PYTHONPATH"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 64,
   "metadata": {},
   "outputs": [],
   "source": [
    "import json\n",
    "import os\n",
    "from collections import Counter, defaultdict\n",
    "from random import sample\n",
    "import math\n",
    "from copy import deepcopy\n",
    "from shutil import copyfile\n",
    "from multiprocessing.pool import ThreadPool\n",
    "\n",
    "from tqdm import tqdm\n",
    "from unidecode import unidecode \n",
    "\n",
    "from data_management.megadb.schema import sequences_schema_check\n",
    "from data_management.megadb.converters.cct_to_megadb import make_cct_embedded, process_sequences, write_json"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# saola\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "dataset_name = 'saola'\n",
    "\n",
    "container_root = '/mink_disk_0/camtraps/swg-camera-traps/'\n",
    "path_prefix = 'public'\n",
    "\n",
    "path_to_output = f'/home/mink/camtraps/data/megadb_jsons/{dataset_name}.json' \n",
    "path_to_output_temp = f'/home/mink/camtraps/data/megadb_jsons/{dataset_name}_temp.json'"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Step 0 - Add an entry to the `datasets` table\n",
    "\n",
    "Done"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Step 1 - Prepare the `sequence` objects to insert into the database"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Step 1a - If you have metadata in COCO Camera Traps (CCT) format already...\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "# path to the CCT json, or a loaded json object\n",
    "path_to_image_cct = '/mink_disk_0/camtraps/megadetectorv5_annotation_prep/swg_camera_traps.json'  # set to None if not available\n",
    "path_to_bbox_cct = None\n",
    "assert not (path_to_image_cct is None and path_to_bbox_cct is None)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Loading image DB...\n",
      "Number of items from the image DB: 2039657\n",
      "Number of images with more than 1 species: 0 (0.0% of image DB)\n",
      "No bbox DB provided.\n",
      "CPU times: user 27 s, sys: 7.01 s, total: 34 s\n",
      "Wall time: 1min 15s\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "\n",
    "embedded = make_cct_embedded(image_db=path_to_image_cct, bbox_db=path_to_bbox_cct)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "In the following step, properties will be moved to the highest level that is still correct, i.e. if a property at the image-level always has the same value for all images in a sequence, it will be moved to be a sequence-level property.\n",
    "\n",
    "If a sequence-level property has the same value throughout this dataset (often 'rights holder'), it will be removed from the `sequence` objects. A message about this will be printed, and you should add that property and its (constant) value to this dataset's entry in the `datasets` table."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "The dataset_name is set to saola. Please make sure this is correct!\n",
      "Making a deep copy of docs...\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "  2%|▏         | 46906/2039657 [00:00<00:04, 468968.76it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Putting 2039657 images into sequences...\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 2039657/2039657 [00:05<00:00, 377104.84it/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Number of sequences: 436617\n",
      "Checking the location field...\n",
      "Checking which fields in a CCT image entry are sequence-level...\n",
      "\n",
      "all_img_properties\n",
      "{'location', 'class', 'frame_num', 'id', 'file', 'datetime'}\n",
      "\n",
      "img_level_properties\n",
      "{'class', 'frame_num', 'id', 'file', 'datetime'}\n",
      "\n",
      "image-level properties that really should be sequence-level\n",
      "{'location'}\n",
      "\n",
      "Finished processing sequences.\n",
      "Example sequence items:\n",
      "\n",
      "{\"dataset\": \"saola\", \"seq_id\": \"0a9bb757-8c2a-11eb-9411-000d3a74c7de\", \"location\": \"loc_0000\", \"images\": [{\"id\": \"c7a2830c-8c29-11eb-860b-000d3a74c7de\", \"datetime\": \"2017-08-15 11:48:11+00:00\", \"frame_num\": 0, \"file\": \"private/lao/loc_0000/2017/08/image_00000.jpg\", \"class\": [\"ignore\"]}, {\"id\": \"c7a2830d-8c29-11eb-854d-000d3a74c7de\", \"datetime\": \"2017-08-15 11:48:19+00:00\", \"frame_num\": 1, \"file\": \"private/lao/loc_0000/2017/08/image_00001.jpg\", \"class\": [\"ignore\"]}]}\n",
      "\n",
      "{\"dataset\": \"saola\", \"seq_id\": \"06f05625-8c2a-11eb-bb0b-000d3a74c7de\", \"location\": \"loc_0198\", \"images\": [{\"id\": \"ce466aae-8c29-11eb-8de0-000d3a74c7de\", \"datetime\": \"2018-09-18 13:04:52+00:00\", \"frame_num\": 0, \"file\": \"public/lao/loc_0198/2018/09/image_00135.jpg\", \"class\": [\"stump_tailed_macaque\"]}, {\"id\": \"ce466aaf-8c29-11eb-a0f0-000d3a74c7de\", \"datetime\": \"2018-09-18 13:04:55+00:00\", \"frame_num\": 1, \"file\": \"public/lao/loc_0198/2018/09/image_00136.jpg\", \"class\": [\"stump_tailed_macaque\"]}, {\"id\": \"ce466ab0-8c29-11eb-a7e0-000d3a74c7de\", \"datetime\": \"2018-09-18 13:04:58+00:00\", \"frame_num\": 2, \"file\": \"public/lao/loc_0198/2018/09/image_00137.jpg\", \"class\": [\"stump_tailed_macaque\"]}, {\"id\": \"ce466ab1-8c29-11eb-aa0f-000d3a74c7de\", \"datetime\": \"2018-09-18 13:05:01+00:00\", \"frame_num\": 3, \"file\": \"public/lao/loc_0198/2018/09/image_00138.jpg\", \"class\": [\"stump_tailed_macaque\"]}, {\"id\": \"ce466ab2-8c29-11eb-b011-000d3a74c7de\", \"datetime\": \"2018-09-18 13:05:07+00:00\", \"frame_num\": 4, \"file\": \"public/lao/loc_0198/2018/09/image_00139.jpg\", \"class\": [\"stump_tailed_macaque\"]}, {\"id\": \"ce466ab3-8c29-11eb-b0d5-000d3a74c7de\", \"datetime\": \"2018-09-18 13:05:09+00:00\", \"frame_num\": 5, \"file\": \"public/lao/loc_0198/2018/09/image_00140.jpg\", \"class\": [\"stump_tailed_macaque\"]}, {\"id\": \"ce466ab4-8c29-11eb-a717-000d3a74c7de\", \"datetime\": \"2018-09-18 13:05:12+00:00\", \"frame_num\": 6, \"file\": \"public/lao/loc_0198/2018/09/image_00141.jpg\", \"class\": [\"stump_tailed_macaque\"]}, {\"id\": \"ce466ab5-8c29-11eb-b376-000d3a74c7de\", \"datetime\": \"2018-09-18 13:05:14+00:00\", \"frame_num\": 7, \"file\": \"public/lao/loc_0198/2018/09/image_00142.jpg\", \"class\": [\"stump_tailed_macaque\"]}]}\n",
      "\n",
      "CPU times: user 1min 25s, sys: 2.31 s, total: 1min 27s\n",
      "Wall time: 1min 27s\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "\n",
    "sequences = process_sequences(embedded, dataset_name)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[OrderedDict([('dataset', 'saola'),\n",
       "              ('seq_id', '1a2ad85d-8c2a-11eb-9418-000d3a74c7de'),\n",
       "              ('location', 'loc_0762'),\n",
       "              ('images',\n",
       "               [{'id': 'd77a05b8-8c29-11eb-be04-000d3a74c7de',\n",
       "                 'datetime': '2018-05-25 13:50:42+00:00',\n",
       "                 'frame_num': 0,\n",
       "                 'file': 'private/lao/loc_0762/2018/05/image_00004.jpg',\n",
       "                 'class': ['ignore']},\n",
       "                {'id': 'd77a05b9-8c29-11eb-addf-000d3a74c7de',\n",
       "                 'datetime': '2018-05-25 13:50:50+00:00',\n",
       "                 'frame_num': 1,\n",
       "                 'file': 'private/lao/loc_0762/2018/05/image_00005.jpg',\n",
       "                 'class': ['ignore']},\n",
       "                {'id': 'd77a05ba-8c29-11eb-ba32-000d3a74c7de',\n",
       "                 'datetime': '2018-05-25 13:50:58+00:00',\n",
       "                 'frame_num': 2,\n",
       "                 'file': 'private/lao/loc_0762/2018/05/image_00006.jpg',\n",
       "                 'class': ['ignore']},\n",
       "                {'id': 'd77a05bb-8c29-11eb-b3cc-000d3a74c7de',\n",
       "                 'datetime': '2018-05-25 13:51:04+00:00',\n",
       "                 'frame_num': 3,\n",
       "                 'file': 'private/lao/loc_0762/2018/05/image_00007.jpg',\n",
       "                 'class': ['ignore']}])]),\n",
       " OrderedDict([('dataset', 'saola'),\n",
       "              ('seq_id', 'fcbf8690-8c29-11eb-9ca5-000d3a74c7de'),\n",
       "              ('location', 'loc_0016'),\n",
       "              ('images',\n",
       "               [{'id': 'c80dc134-8c29-11eb-8989-000d3a74c7de',\n",
       "                 'datetime': '2017-10-30 12:49:18+00:00',\n",
       "                 'frame_num': 0,\n",
       "                 'file': 'public/lao/loc_0016/2017/10/image_01197.jpg',\n",
       "                 'class': ['blurred']},\n",
       "                {'id': 'c80dc135-8c29-11eb-b911-000d3a74c7de',\n",
       "                 'datetime': '2017-10-30 12:49:19+00:00',\n",
       "                 'frame_num': 1,\n",
       "                 'file': 'public/lao/loc_0016/2017/10/image_01198.jpg',\n",
       "                 'class': ['blurred']},\n",
       "                {'id': 'c80dc136-8c29-11eb-966e-000d3a74c7de',\n",
       "                 'datetime': '2017-10-30 12:49:20+00:00',\n",
       "                 'frame_num': 2,\n",
       "                 'file': 'public/lao/loc_0016/2017/10/image_01199.jpg',\n",
       "                 'class': ['blurred']}])])]"
      ]
     },
     "execution_count": 17,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# sample some sequences to make sure they are what you expect\n",
    "sample(sequences, 2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Public/private was decided per image (the leading component of `file`), so\n",
    "# filter at the image level and keep every sequence that still has images.\n",
    "# New sequence/image dicts are built instead of editing `sequences` in place:\n",
    "# that keeps this cell idempotent, because a re-run after the prefix had been\n",
    "# stripped in place would no longer find any 'public/' image.\n",
    "public_marker = 'public/'\n",
    "public_sequences = []\n",
    "\n",
    "for seq in sequences:\n",
    "    public_images = [\n",
    "        # slice the prefix off rather than split() on it, which would also cut\n",
    "        # the path at any later 'public/' segment\n",
    "        dict(im, file=im['file'][len(public_marker):])\n",
    "        for im in seq['images']\n",
    "        if im['file'].startswith(public_marker)\n",
    "    ]\n",
    "    if public_images:\n",
    "        public_seq = seq.copy()  # shallow copy keeps the mapping type and key order\n",
    "        public_seq['images'] = public_images\n",
    "        public_sequences.append(public_seq)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "410464"
      ]
     },
     "execution_count": 20,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(public_sequences)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "935"
      ]
     },
     "execution_count": 21,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# distinct `location` values among the public sequences\n",
    "locations = {seq['location'] for seq in public_sequences}\n",
    "len(locations)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Step 2 - Pass the schema check"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Verified that the sequence items meet requirements not captured by the schema.\n",
      "Verified that the sequence items conform to the schema.\n",
      "CPU times: user 2min 38s, sys: 3.59 ms, total: 2min 38s\n",
      "Wall time: 2min 38s\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "\n",
    "sequences_schema_check.sequences_schema_check(public_sequences)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [],
   "source": [
    "with open(path_to_output_temp, 'w', encoding='utf-8') as f:\n",
    "    json.dump(public_sequences, f, ensure_ascii=False)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Step 2b - Sample"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "222309"
      ]
     },
     "execution_count": 26,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "text/plain": [
       "926"
      ]
     },
     "execution_count": 26,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# A sequence is kept only if none of its images has one of these labels as\n",
    "# its first class entry.\n",
    "excluded_labels = ['ignore', 'empty', 'problem', 'blurred']\n",
    "\n",
    "public_sequences_good = [\n",
    "    seq for seq in public_sequences\n",
    "    if not any(im['class'][0] in excluded_labels for im in seq['images'])\n",
    "]\n",
    "locations_good = {seq['location'] for seq in public_sequences_good}\n",
    "\n",
    "len(public_sequences_good)\n",
    "len(locations_good)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 42,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "119288"
      ]
     },
     "execution_count": 42,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "text/plain": [
       "913"
      ]
     },
     "execution_count": 42,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Sample 50k sequences (not covering all locations, but most) and keep only\n",
    "# the first 3 frames of each.\n",
    "# NOTE: no random seed is set in this notebook, so each run draws a different sample.\n",
    "num_sequences_to_sample = 50000\n",
    "max_frames_per_sequence = 3\n",
    "\n",
    "# cap at the population size so sample() cannot raise ValueError on a smaller set\n",
    "sampled_sequences = sample(public_sequences_good,\n",
    "                           min(num_sequences_to_sample, len(public_sequences_good)))\n",
    "\n",
    "sequences_short = []\n",
    "for seq in sampled_sequences:\n",
    "    # shallow-copy before truncating so the entries of public_sequences_good\n",
    "    # (the same objects as in public_sequences) keep all of their images\n",
    "    short_seq = seq.copy()\n",
    "    short_seq['images'] = seq['images'][:max_frames_per_sequence]\n",
    "    sequences_short.append(short_seq)\n",
    "\n",
    "locations_sampled = {seq['location'] for seq in sequences_short}\n",
    "\n",
    "num_images = sum(len(seq['images']) for seq in sequences_short)\n",
    "num_images\n",
    "\n",
    "len(locations_sampled)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 48,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[OrderedDict([('dataset', 'saola'),\n",
       "              ('seq_id', 'dffd2979-8c29-11eb-acee-000d3a74c7de'),\n",
       "              ('location', 'loc_0385'),\n",
       "              ('images',\n",
       "               [{'id': 'ca5aa776-8c29-11eb-9b92-000d3a74c7de',\n",
       "                 'datetime': '2020-07-13 08:24:51+00:00',\n",
       "                 'frame_num': 0,\n",
       "                 'file': 'lao/loc_0385/2020/07/image_00393.jpg',\n",
       "                 'class': ['large_antlered_muntjac']},\n",
       "                {'id': 'ca5aa777-8c29-11eb-91db-000d3a74c7de',\n",
       "                 'datetime': '2020-07-13 08:24:55+00:00',\n",
       "                 'frame_num': 1,\n",
       "                 'file': 'lao/loc_0385/2020/07/image_00394.jpg',\n",
       "                 'class': ['large_antlered_muntjac']},\n",
       "                {'id': 'ca5aa778-8c29-11eb-ae63-000d3a74c7de',\n",
       "                 'datetime': '2020-07-13 08:24:58+00:00',\n",
       "                 'frame_num': 2,\n",
       "                 'file': 'lao/loc_0385/2020/07/image_00395.jpg',\n",
       "                 'class': ['large_antlered_muntjac']}])])]"
      ]
     },
     "execution_count": 48,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "sample(sequences_short, 1)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Step 2c - Download the sampled images"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 56,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "119288"
      ]
     },
     "execution_count": 56,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# one newline-terminated entry per sampled image, with 'public/' prepended to its path\n",
    "list_to_download = [\n",
    "    'public/' + im['file'] + '\\n'\n",
    "    for seq in sequences_short\n",
    "    for im in seq['images']\n",
    "]\n",
    "len(list_to_download)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 57,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'public/lao/loc_0435/2020/03/image_00050.jpg\\n'"
      ]
     },
     "execution_count": 57,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "list_to_download[10000]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 58,
   "metadata": {},
   "outputs": [],
   "source": [
    "with open('/mink_disk_0/camtraps/megadetectorv5_annotation_prep/batch_12_lists/saola_files.txt', 'w') as f:\n",
    "    f.writelines(list_to_download)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Step 2d - Copy to flat folder"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 49,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 50000/50000 [00:00<00:00, 53647.62it/s]\n"
     ]
    }
   ],
   "source": [
    "# Pair each sampled image's path under the container with a destination in one\n",
    "# flat folder, named from the dataset name, sequence id and frame number.\n",
    "path_pairs = []\n",
    "\n",
    "for seq in tqdm(sequences_short):\n",
    "    seq_id = seq['seq_id']\n",
    "\n",
    "    for frame, rel_path in [(im['frame_num'], im['file']) for im in seq['images']]:\n",
    "        flat_name = f'{dataset_name}.seq{seq_id}.frame{frame}.jpg'\n",
    "        path_pairs.append((\n",
    "            os.path.join(container_root, path_prefix, rel_path),\n",
    "            os.path.join('/mink_disk_0/camtraps/imerit12g', flat_name),\n",
    "        ))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 60,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "119288"
      ]
     },
     "execution_count": 60,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "text/plain": [
       "[('/mink_disk_0/camtraps/swg-camera-traps/public/lao/loc_0292/2019/03/image_00042.jpg',\n",
       "  '/mink_disk_0/camtraps/imerit12g/saola.seq323bb6f5-8c2a-11eb-b0f1-000d3a74c7de.frame2.jpg'),\n",
       " ('/mink_disk_0/camtraps/swg-camera-traps/public/vietnam/loc_0954/2018/08/image_00162.jpg',\n",
       "  '/mink_disk_0/camtraps/imerit12g/saola.seqf17d1519-8c29-11eb-8401-000d3a74c7de.frame2.jpg')]"
      ]
     },
     "execution_count": 60,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(path_pairs)\n",
    "sample(path_pairs, 2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 69,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 119288/119288 [00:39<00:00, 3000.20it/s] \n"
     ]
    }
   ],
   "source": [
    "# For every source image missing at its expected location, copy it over from\n",
    "# the doubly nested 'swg-camera-traps/swg-camera-traps/public' tree (old_path).\n",
    "for src_path, dst_path in tqdm(path_pairs):\n",
    "    if not os.path.exists(src_path):\n",
    "        old_path = src_path.replace('/mink_disk_0/camtraps/swg-camera-traps/public', '/mink_disk_0/camtraps/swg-camera-traps/swg-camera-traps/public')\n",
    "        # Create the parent directory first; exist_ok avoids failing when it is\n",
    "        # already there, so any error raised below is about the file itself\n",
    "        # (e.g. old_path missing) rather than a FileExistsError from makedirs.\n",
    "        os.makedirs(os.path.dirname(src_path), exist_ok=True)\n",
    "        _ = copyfile(old_path, src_path)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 70,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 3min 56s, sys: 9min 31s, total: 13min 27s\n",
      "Wall time: 16min 57s\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "\n",
    "def copy_file(src_path, dst_path):\n",
    "    \"\"\"Copy src_path to dst_path unless dst_path already exists.\n",
    "\n",
    "    Returns the value of shutil.copyfile when a copy is made, and None\n",
    "    when dst_path was already present and nothing was copied.\n",
    "    \"\"\"\n",
    "    if os.path.exists(dst_path):\n",
    "        return None\n",
    "    return copyfile(src_path, dst_path)\n",
    "\n",
    "\n",
    "# fan the (src, dst) pairs out over 12 worker threads\n",
    "with ThreadPool(12) as workers:\n",
    "    dst_paths = workers.starmap(copy_file, path_pairs)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python [conda env:cameratraps] *",
   "language": "python",
   "name": "conda-env-cameratraps-py"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.10"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
